R tips

packages

# data processing
install.packages("tidyverse")
install.packages("data.table")

# date
install.packages("lubridate")

# visualization
install.packages("psych")
install.packages("scales")
install.packages("GGally") # http://www.ggplot2-exts.org/gallery/

# dev
install.packages("devtools")
install.packages("testthat")
install.packages("roxygen2")

# machine learning
install.packages("caret")
#install.packages("ranger")
install.packages("randomForest")
install.packages("xgboost")

# recommendation
install.packages("arules")
install.packages("arulesViz")

# stats modeling
install.packages("glmnet")

# time series
install.packages("xts")
install.packages("forecast")
install.packages("zoo")

# web app
install.packages("shiny")

# reporting
install.packages("rmarkdown")
install.packages("formatR") # R notebook

# bayes
install.packages("rstan")
#install.packages("dlm")

install.packages("")

Paste from clipboard

# win/Linux?
data <- read.table("clipboard")

# mac
data <- read.table(pipe("pbpaste"))

basic functions

# not Python's range() (Python brain); R's range() just returns the min and max (demo below)
seq(1, 9) # 1 to 9
seq(1, 9, by = 0.1) # step
seq(1, 9, length.out = 5) # n = 5
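
# for contrast, range() only returns the minimum and maximum
range(1:9)
[1] 1 9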

# log
log(2, base = exp(1))

# shuffle/sampling
sample(1:5)
sample(1:5, 3)
sample(1:5, 10, replace=TRUE)

# zero matrix
matrix(0, nrow=2, ncol=2)

# identity matrix
diag(nrow=2)

repeat / replicate

rep(1:3, 2)
[1] 1 2 3 1 2 3

# to fill by row, transpose with t()
replicate(2, 1:3)
     [,1] [,2]
[1,]    1    1
[2,]    2    2
[3,]    3    3
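
# as noted above, t() gives the by-row layout
t(replicate(2, 1:3))
     [,1] [,2] [,3]
[1,]    1    2    3
[2,]    1    2    3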

subtract a vector from a matrix

(mat <- matrix(1:6, ncol=2))
     [,1] [,2]
[1,]    1    4
[2,]    2    5
[3,]    3    6

# MARGIN: 1 -> row, 2 -> col
sweep(mat, 2, mat[1, ])
     [,1] [,2]
[1,]    0    0
[2,]    1    1
[3,]    2    2

# for a data frame (not a matrix), convert it with data.matrix() first
# the column names survive the conversion
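
# a quick sketch with iris (a data frame): subtract the column means
iris_mat <- data.matrix(iris[, -5])
head(sweep(iris_mat, 2, colMeans(iris_mat)))  # column names are kept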

the apply family

# with MARGIN = c(1, 2) the function is applied to every element
# like numpy, sum() collapses everything into a single scalar, so apply() is needed for row/column sums
apply(df, MARGIN, sum)  # MARGIN: 1 = rows, 2 = columns
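
# a concrete illustration, using the numeric columns of iris as the data frame
num_iris <- iris[, -5]
head(apply(num_iris, 1, sum))        # row sums (MARGIN = 1)
apply(num_iris, 2, sum)              # column sums (MARGIN = 2)
head(apply(num_iris, c(1, 2), sqrt)) # applied to every element
sum(num_iris)                        # collapses to a single scalar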

dot/outer product

1:3 %*% 1:3
     [,1]
[1,]   14

1:3 %o% 1:3
     [,1] [,2] [,3]
[1,]    1    2    3
[2,]    2    4    6
[3,]    3    6    9

list

# append
l <- list()
l <- c(l, list(1))
l <- c(l, list(2), list(3))

dummy

install.packages("dummies")
library(dummies)
a <- 1:5
df <- data.frame(a=a)

dummies::dummy(a)
dummies::dummy("a", df)

duplicated / drop duplicated

df <- data.frame(d1=c(1,1,2,3,3,4,5,6,6),
                 d2=c(1,1,2,3,3,4,5,5,6))
df$df_duplicated          <- duplicated(df[, c("d1", "d2")])
df$d1_duplicated          <- duplicated(df$d1)
df$d1_duplicated_fromlast <- duplicated(df$d1, fromLast = T)
df$d1_duplicated_all      <- duplicated(df$d1) | duplicated(df$d1, fromLast = T)
# => https://stackoverflow.com/questions/7854433/finding-all-duplicate-rows-including-elements-with-smaller-subscripts

# drop duplicated
unique(df$d1)
df %>% select(d1, d2) %>% distinct()

# drop duplicates based on d1 only (d2 is dropped)
df %>% select(d1, d2) %>% distinct(d1)
# drop duplicates based on d1 but keep the other columns
df %>% select(d1, d2) %>% distinct(d1, .keep_all = T)

standardization

scale(iris[, -5]) == scale(iris[, -5], center = T, scale = T)
# center and scale do not have to mean standardization
# both accept numeric vectors as well, so min-max scaling is also possible (sketch below)
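
# a minimal min-max sketch via scale(), using iris as example data
x <- iris[, -5]
mins <- apply(x, 2, min)
maxs <- apply(x, 2, max)
x_minmax <- scale(x, center = mins, scale = maxs - mins)
summary(x_minmax)  # every column now runs from 0 to 1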

PCA

# by default the decomposition is of the covariance matrix (center = T, scale = F)
iris_pca <- prcomp(iris[, -5])

# eigendecomposition
iris_cov_eigen <- eigen(cov(iris[, -5]))
## eigenvectors (signs may flip, and exact == is fragile for doubles)
iris_pca$rotation == iris_cov_eigen$vectors
## eigenvalues (same caveat about exact comparison)
iris_pca$sdev ^ 2 == iris_cov_eigen$values

# transformed data and reconstruction of the (centered) original matrix; rounding can make exact == miss
iris_pca$x
scale(iris[, -5], scale = F) == (iris_pca$x %*% t(iris_pca$rotation))

# proportion of variance explained
iris_pca_summary <- summary(iris_pca)
iris_pca_summary
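
# the proportion of variance can also be computed directly from sdev
iris_pca$sdev^2 / sum(iris_pca$sdev^2)          # proportion of variance
cumsum(iris_pca$sdev^2) / sum(iris_pca$sdev^2)  # cumulative proportion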

expanding a list of arguments with do.call

# pass vectors to cbind
do.call(cbind, list(1:2, 3:4, 5:6))
     [,1] [,2] [,3]
[1,]    1    3    5
[2,]    2    4    6

## when a required argument is missing
myfunc <- function(a, b, c, bias1, bias2 = 2) {
  return(sum(a, b, c) + bias1 + bias2)
}
do.call(myfunc, list(1:2, 3:4, bias1 = 1))
Error in (function (a, b, c, bias1, bias2 = 2)  :
  argument "c" is missing, with no default

## default arguments
do.call(myfunc, list(1:2, 3:4, 5:6, bias1 = 1))
[1] 24

## variadic function (...)
variadic_func <- function(..., bias1, bias2 = 2) {
  return(sum(...) + bias1 + bias2)
}
do.call(variadic_func, list(1:2, 3:4, bias1 = 1))
[1] 13
do.call(variadic_func, list(1, 2, 3, 4, 5, bias1 = 1))
[1] 18

ggplot

library(ggplot2)
library(gridExtra)

# multiple plots (df below is assumed example data with columns x, y1, y2)
df <- data.frame(x = 1:10, y1 = rnorm(10), y2 = rnorm(10))
plots <- list()
plots[[1]] <- ggplot2::ggplot() +
  geom_point(aes(x, y1), data = df)
plots[[2]] <- ggplot2::ggplot() +
  geom_point(aes(x, y2), data = df)

gridExtra::marrangeGrob(plots, nrow=1, ncol=2)
# or
do.call(gridExtra::grid.arrange, c(plots, nrow=1, ncol=2))

# violin
geom_violin

# for pca
install.packages("ggbiplot")
library(ggbiplot)
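
# a minimal sketch, assuming the iris_pca object from the PCA section above;
# groups colors the points by species
ggbiplot::ggbiplot(iris_pca, groups = iris$Species)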

time series

data("AirPassengers")
class(AirPassengers)
# => [1] "ts"
tsp(AirPassengers)
# => start end frequency

# plot
plot(AirPassengers)
decomposed <- decompose(AirPassengers)
plot(decomposed)
# observed, trend, seasonal, random

## rate of change: log-diff approximation vs. the plain ratio
plot(diff(log(AirPassengers)))
plot(diff(AirPassengers) / AirPassengers)

## (partial) autocorrelation
acf(AirPassengers)
pacf(AirPassengers)

# TODO

computing covariance and correlation in R (hand-rolled vs. {stats})

## practice turning formulas into code
# so that Σ and matrices/vectors stop looking scary
# it is fine as long as I stay calm
# writing it myself made the formulas easier to picture
# the for loops look clumsy, though...
# I'd like to know a better way (a vectorized sketch follows the two loop versions below)

v1 <- 1:9
v2 <- sample(v1)
m <- matrix(sample(v2), 3)

# cor
(function(m){
  len <- ncol(m)  # number of variables (columns); dim(m)[1] only worked here because m is square
  mat.cor <- diag(rep(1, len))
  for (k in 1:len) {
    for (j in 1:len) {
      if (k == j) next
      if (k > j) {
        cor.kj <- mat.cor[j, k]
      } else {
        d1 <- m[, k] - mean(m[, k])
        d2 <- m[, j] - mean(m[, j])
        cor.kj <- sum(d1 * d2) / (sqrt(sum(d1 ** 2)) * sqrt(sum(d2 ** 2)))
      }
      mat.cor[k, j] <- cor.kj
    }
  }
  return(mat.cor)
})(m)
cor(m)

# cov
(function(m){
  len <- ncol(m)  # number of variables (columns)
  mat <- matrix(0, len, len)
  for (k in 1:len) {
    for (j in 1:len) {
      d1 <- m[, k] - mean(m[, k])
      d2 <- m[, j] - mean(m[, j])
      mat[k, j] <- sum(d1 * d2)
    }
  }
  return(mat / (nrow(m) - 1))  # divide by the number of observations minus 1
})(m)
cov(m)
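
# one less-clumsy alternative (a sketch): center the columns and use crossprod()
m_centered <- scale(m, center = TRUE, scale = FALSE)
cov_vec <- crossprod(m_centered) / (nrow(m) - 1)
cor_vec <- cov_vec / tcrossprod(sqrt(diag(cov_vec)))
all.equal(cov_vec, cov(m), check.attributes = FALSE)
all.equal(cor_vec, cor(m), check.attributes = FALSE)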

variable removal in R

library(dplyr)
library(caret)

df.filtered <- df %>%
  select(-caret::nearZeroVar(df)) %>%
  select(-caret::findCorrelation(cor(.), cutoff = 0.9))  # 0.9 is just an example threshold; cor() needs all-numeric columns

# bonus
# tidyr is pronounced "tidy-R"